library(data.table)
library(tidyverse)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## ── Attaching packages ────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.1 ✔ dplyr 0.8.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ purrr::transpose() masks data.table::transpose()
I import the raw file directly from GitHub.
# Read the mall-customers CSV from its raw GitHub URL into a data.table.
customers <- fread(
  input = "https://raw.githubusercontent.com/SteffiPeTaffy/machineLearningAZ/master/Machine%20Learning%20A-Z%20Template%20Folder/Part%204%20-%20Clustering/Section%2025%20-%20Hierarchical%20Clustering/Mall_Customers.csv"
)
# Print a compact, one-line-per-column overview of the loaded table.
customers %>% glimpse()
## Observations: 200
## Variables: 5
## $ CustomerID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, …
## $ Genre <chr> "Male", "Male", "Female", "Female", "Fe…
## $ Age <int> 19, 21, 20, 23, 31, 22, 35, 23, 64, 30,…
## $ `Annual Income (k$)` <int> 15, 15, 16, 16, 17, 17, 18, 18, 19, 19,…
## $ `Spending Score (1-100)` <int> 39, 81, 6, 77, 40, 76, 6, 94, 3, 72, 14…
# Per-column summary of the raw table (printed below).
summary(customers)
## CustomerID Genre Age Annual Income (k$)
## Min. : 1.00 Length:200 Min. :18.00 Min. : 15.00
## 1st Qu.: 50.75 Class :character 1st Qu.:28.75 1st Qu.: 41.50
## Median :100.50 Mode :character Median :36.00 Median : 61.50
## Mean :100.50 Mean :38.85 Mean : 60.56
## 3rd Qu.:150.25 3rd Qu.:49.00 3rd Qu.: 78.00
## Max. :200.00 Max. :70.00 Max. :137.00
## Spending Score (1-100)
## Min. : 1.00
## 1st Qu.:34.75
## Median :50.00
## Mean :50.20
## 3rd Qu.:73.00
## Max. :99.00
# Count missing values per column: flag NA cells, then sum each column.
customers %>%
  is.na() %>%
  colSums()
## CustomerID Genre Age
## 0 0 0
## Annual Income (k$) Spending Score (1-100)
## 0 0
There are no missing (NA) values. Cool! Now let's dive into clustering.
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Rename the raw CSV columns to short snake_case names. The mapping is done
# by name (old -> new) rather than by position, so a change in the column
# order or count of the remote file raises an error instead of silently
# mislabelling columns. setnames() also renames in place without copying.
setnames(
  customers,
  old = c(
    "CustomerID", "Genre", "Age",
    "Annual Income (k$)", "Spending Score (1-100)"
  ),
  new = c("id", "genre", "age", "annual_income", "spending_score")
)
# Treat genre as a categorical variable (used below for colouring the plot).
customers$genre <- as.factor(customers$genre)
# The id column is not used in the plot below, so drop it.
customers <- customers %>% select(-id)
str(customers)
## Classes 'data.table' and 'data.frame': 200 obs. of 4 variables:
## $ genre : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 2 1 ...
## $ age : int 19 21 20 23 31 22 35 23 64 30 ...
## $ annual_income : int 15 15 16 16 17 17 18 18 19 19 ...
## $ spending_score: int 39 81 6 77 40 76 6 94 3 72 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Interactive 3-D scatter of the three numeric columns, one marker per
# customer, coloured by genre using the two supplied hex colours.
p <- customers %>%
  plot_ly(
    x = ~age,
    y = ~annual_income,
    z = ~spending_score,
    color = ~genre,
    colors = c("#BF382A", "#0C4B8E")
  ) %>%
  add_markers() %>%
  layout(
    scene = list(
      xaxis = list(title = "age"),
      yaxis = list(title = "annual_income"),
      zaxis = list(title = "spending_score")
    )
  )
# Auto-print the plot object to render it.
p
Note that the `echo = FALSE` parameter can be added to a code chunk to prevent printing of the R code that generated the plot.